#delim cr

** Session settings: no -more- prompts, no variable-name abbreviation,
** -pause- enabled, and no logo on PostScript graph output.
set varabbrev off
set more off
graph set ps logo off
pause on

** Close any log left open by an earlier run, then start a fresh
** plain-text log with wide lines.
capture log close
set logtype text
set linesize 200
log using ../log/construct-daily-dataset.log , replace

/* --------------------------------------

Build the daily dataset: daily productivity
records combined with all of our other variables.

--------------------------------------- */

** Start from an empty session.
estimates clear
clear all

************************************************************
**   Bring in the hotel data
************************************************************

** Load the raw hotel records and show the full variable list.
use ../dta/ctripHTL.dta , clear
describe, fullnames

** Report exact duplicate rows, then remove them so that they are not
** counted twice by the sums below.
duplicates report
duplicates drop

** Collapse to one row per employee and date: activity measures are summed,
** and the first value in each cell is kept for the worker descriptors.
collapse (sum) NumCall OrderSubmitted OrderCompleted RecommendScore NumNotification NumRoomFullOrderChange NoCall OutCall CallLengthsecond LogInLengthsecond OrderHandleOverTime ///
(first) EmployeeGroup GroupType EmployeeType Class Group WorkPlace  ///
, by(EmployeeID StatDate) fast

** Sample indicator (1 = hotel). Stored as byte so that it has the same
** storage type as the indicator created for the flight sample.
gen byte hotel = 1

tempfile hotel
save `hotel'

************************************************************
**   Bring in the flight data
************************************************************

** Load the raw flight records and show the full variable list.
use ../dta/ctripFLT.dta , clear
describe, fullnames

** Report exact duplicate rows, then remove them before aggregating.
duplicates report
duplicates drop

** Collapse to one row per employee and date: activity measures are summed,
** and the first value in each cell is kept for the descriptors and week.
collapse ///
(sum) NumCall NumTicket TicketScore NumNotification NumInsurance RecommendScore LogInLengthsecond CallLengthsecond ///
(first) EmployeeGroup GroupType EmployeeType Class Group WorkPlace week ///
, by(EmployeeID StatDate) fast

** Sample indicator (0 = flight).
generate byte hotel = 0

tempfile flight
save `flight'

************************************************************
**   Combine and describe hotel and flight data
************************************************************

** Stack the two samples: flight records first, hotel records appended.
use `flight' , clear
append using `hotel'

tabulate hotel , missing

** Many outcomes exist for only one of the two sets of workers,
** so summarize each sample separately (0 = flight, 1 = hotel).
forvalues sample = 0/1 {
	summarize if hotel == `sample'
}

** From here on the record date is called Date.
rename StatDate Date
codebook Date

************************************************************
**   Describe categorical variables
************************************************************

tab1 EmployeeGroup GroupType EmployeeType Class Group WorkPlace , missing sort

************************************************************
**   Save memory by eliminating strings
************************************************************

describe, fullnames

** Inspect the string variables before recoding them; many can be
** replaced by numeric codes to save space.
codebook EmployeeID EmployeeGroup GroupType EmployeeType Class Group WorkPlace 

** Labelled numeric versions of three categorical strings.
encode GroupType , generate(group_type)
encode EmployeeType , generate(employee_type)
encode Class , generate(class)
drop GroupType EmployeeType Class

** 0/1 flag for Group == "treatmentGroup"; missing when Group is empty.
tabulate Group , missing
generate treatment_group = (Group == "treatmentGroup") if Group != ""
drop Group

** 0/1 flag for WorkPlace == "home"; missing when WorkPlace is empty.
tabulate WorkPlace , missing
generate byte at_home = (WorkPlace == "home") if WorkPlace != ""
drop WorkPlace

** Discard the week variable carried in from the raw data.
drop week

************************************************************
**   Create fixed effects
************************************************************

** Calendar month of each record, stored as a Stata monthly date.
generate yearmonth = mofd(Date)
format %tm yearmonth
codebook yearmonth

** Day of the week as returned by dow() (0 = Sunday, ..., 6 = Saturday).
generate day_of_week = dow(Date)
tabulate day_of_week

************************************************************
**   Describe data across two samples
************************************************************

** Outcome columns shown in the listings below. This local was previously
** never defined, so the listings silently showed no outcomes at all.
** NOTE(review): the choice of columns is a reviewer default (three outcomes
** present in both samples, two hotel-only, one flight-only) -- adjust as needed.
local outcomes NumCall CallLengthsecond LogInLengthsecond OrderSubmitted OrderCompleted NumTicket

** Employees whose records are listed by hand.
local spot_check_ids S02727 N04543 N08885 S12536 S20628

sort EmployeeID Date hotel
foreach id of local spot_check_ids {
	list hotel Date `outcomes' if EmployeeID == "`id'" 
}

************************************************************
**   Describe data within one sample (hotel)
************************************************************

** Temporary week counter measured from Sunday, January 6, 1991, used only
** to draw separator lines between weeks in the listings.
local sunday1 = mdy(1,6,1991)
gen week = floor((Date - `sunday1')/7)

sort EmployeeID Date 
foreach id of local spot_check_ids {
	list hotel Date day_of_week `outcomes' if EmployeeID == "`id'" , sepby(week)
}

drop week

************************************************************
**   Make sure all employee IDs are formatted properly
************************************************************

** Tabulate the first character of EmployeeID twice: once on the raw IDs
** and once after upper-casing them. We assume that lower-case first
** initials are a mistake here.
** NOTE(review): upper-casing happens after the collapse by EmployeeID and
** date, so IDs differing only by case would remain as separate rows --
** confirm whether that occurs in the data.
forvalues pass = 1/2 {
	codebook EmployeeID 
	generate str1 first_initial = substr(EmployeeID,1,1)
	tabulate first_initial , missing
	drop first_initial

	if `pass' == 1 {
		replace EmployeeID = upper(EmployeeID)
	}
}

************************************************************
**   Create a year_week variable
************************************************************

** The BLRY group created their own year_week variable. Here we create
** a mapping from date to that week variable. Note that this isn't as simple
** as just using the Stata week(.) function. In fact, this has taken some time,
** since they used a very particular definition of week.

preserve
	** Scratch dataset with one observation per calendar date:
	** 10,000 consecutive days starting the day after January 1, 2007.
	clear
	set obs 10000
	generate Date = mdy(1, 1, 2007) + _n
	format %td Date
	codebook Date

	** Running week counter whose weeks begin on Monday (in China the week
	** starts on Monday, not Sunday), used instead of Stata's week(.)
	** function. The origin, Monday January 7, 1991, is arbitrary; it only
	** fixes where the week boundaries fall.
	local week_origin = mdy(1, 7, 1991)
	generate week = floor((Date - `week_origin') / 7)

	** Assign each week to the calendar year of its earliest day, and find
	** the first week assigned to each year.
	generate year = year(Date)
	egen year_at_start_of_week = min(year) , by(week)
	egen min_week = min(week) , by(year_at_start_of_week)

	** Within-year week numbering: weeks assigned to 2010 start at 2, weeks
	** assigned to 2011 or later start at 1. Earlier years stay missing.
	generate year_week = year_at_start_of_week * 100 + (week - min_week) + cond(year_at_start_of_week == 2010, 2, 1) if year_at_start_of_week >= 2010

	** January 1-3, 2010 fall in a week that began in 2009; they are
	** assigned to week 201001 by hand.
	replace year_week = 201001 if inrange(Date, mdy(1, 1, 2010), mdy(1, 3, 2010))

	sort Date
	list Date year_week if inrange(year(Date), 2010, 2011) , sepby(week)

	keep Date year_week
	compress

	tempfile week_mapping
	save `week_mapping'
restore

** Attach year_week to every record by date, then discard the calendar
** dates from the mapping that match no record.
merge m:1 Date using `week_mapping'
codebook Date
keep if _merge != 2
drop _merge

************************************************************
**   Describe those outcomes by hand in the BLRY data
************************************************************

preserve

	use ../src/bloom-data/performance_during_exper.dta , clear
	describe, fullnames
	tab year_week 

	** Rebuild an EmployeeID string from personid: "S" followed by the
	** stub zero-padded to five digits. The previous chain of strict
	** inequalities left stubs of exactly 10, 100, 1000 or 10000 (and any
	** stub below 10) without an ID, and turned a missing stub into "S.".
	** NOTE(review): assumes the stub is a whole number; the %05.0f format
	** would round a fractional stub -- confirm personid is always even.
	gen EmployeeID_stub = personid / 2 - 1000
	gen str10 EmployeeID = "S" + string(EmployeeID_stub, "%05.0f") if !missing(EmployeeID_stub) & EmployeeID_stub >= 0
	codebook EmployeeID

	sort EmployeeID year_week

	** Weekly phonecallraw for a handful of employees, for comparison by
	** hand with the daily data.
	foreach id in S05990 S06024 S06133 S06362 S06647 S07052 S07259 S07406 S07507 {
		list year_week phonecallraw if EmployeeID == "`id'"
	}

restore

************************************************************
**   Describe some outcomes by hand
************************************************************

** We do this here to compare to the weekly data
preserve

	** Employees inspected by hand.
	local check_ids S05990 S06024 S06133 S06362 S06647 S07052 S07259 S07406 S07507

	sort EmployeeID Date

	** Daily call counts during 2010.
	foreach id of local check_ids {
		list Date NumCall if EmployeeID == "`id'" & year(Date) == 2010
	}

	** Same employees after summing calls to the employee-by-year_week level.
	collapse (sum) NumCall , by(EmployeeID year_week) fast

	foreach id of local check_ids {
		list year_week NumCall if EmployeeID == "`id'" 
	}

restore

************************************************************
**   Merge to time-invariant characteristics in BLRY data
************************************************************

** Attach the time-invariant BLRY controls to every daily record.
sort EmployeeID
merge m:1 EmployeeID using ../dta/blry-time-invariant-controls.dta  

** Merge diagnostics: match status by first letter of the ID and by month,
** and the list of matched employees.
gen str1 first_initial = substr(EmployeeID,1,1)
tab first_initial _merge
table yearmonth _merge
tab EmployeeID if _merge == 3

** first_initial is a scratch variable, dropped after its other two uses in
** this file; it was previously left behind here and ended up in the saved
** dataset.
** NOTE(review): confirm nothing downstream reads first_initial from the
** saved file.
drop first_initial

** Discard control rows that match no daily record.
drop if _merge == 2
drop _merge

************************************************************
**   Add in time-stamp data
************************************************************

** We have time stamps for each worker and day, which we merge
** in here.

** Show what the time-stamp file contains without loading it.
describe using ../dta/reshaped-time-stamps.dta 

** Upper-case the IDs again before merging, with a codebook before and after.
codebook EmployeeID
replace EmployeeID = upper(EmployeeID)
codebook EmployeeID

** Attach time stamps by employee and date; discard time-stamp rows that
** match no daily record, then tabulate the mean merge status by date.
** NOTE(review): c() is the pre-Stata-17 -table- syntax; on Stata 17 or
** later it likely needs version control (e.g. a version 16: prefix) --
** confirm the target Stata version.
merge m:1 EmployeeID Date using ../dta/reshaped-time-stamps.dta 
keep if _merge != 2
table Date , c(mean _merge)
drop _merge

************************************************************
**   Study time-stamp data issues
************************************************************

** In the entire time-stamp data, about 5 percent of time stamps
** occur at 1 AM. This creates a problem for us, since 1 AM is the 
** next day. Here we check how many of our main workers are clocking
** in at strange hours.

** Indicators for how many of the hour1-hour4 time stamps are filled in,
** counting from hour1 onward. Patterns with a gap (for example hour1
** missing but hour2 present) set none of the indicators.
gen byte no_logins = (hour1 == . & hour2 == . & hour3 == . & hour4 == .)
gen byte one_login = (hour1 != . & hour2 == . & hour3 == . & hour4 == .)
gen byte two_logins = (hour1 != . & hour2 != . & hour3 == . & hour4 == .)
gen byte three_logins = (hour1 != . & hour2 != . & hour3 != . & hour4 == .)
gen byte four_logins = (hour1 != . & hour2 != . & hour3 != . & hour4 != .)

** The indicators are named explicitly: the wildcard *logins used before
** does not match one_login (singular), so it was left out of the summaries.
local login_flags no_logins one_login two_logins three_logins four_logins

** Share of records in each category by expgroup, for all records and for
** records with a positive, non-missing number of calls.
** NOTE(review): c() is the pre-Stata-17 -table- syntax -- confirm the
** target Stata version.
table expgroup , c(mean no_logins mean one_login mean two_logins mean three_logins mean four_logins) 
table expgroup if NumCall > 0 & NumCall < . , c(mean no_logins mean one_login mean two_logins mean three_logins mean four_logins) 

** Same summaries for records with expgroup missing.
sum `login_flags' if expgroup == .
sum `login_flags' if expgroup == . & NumCall > 0 & NumCall < .

drop `login_flags'

************************************************************
**   Restrictions
************************************************************

codebook EmployeeID

** Records without an employee ID cannot be used.
drop if missing(EmployeeID)

** Distribution of the number of records per employee (diagnostic only).
by EmployeeID, sort: generate obs_per_id = _N
tabulate obs_per_id
drop obs_per_id

************************************************************
**   Save & Close
************************************************************

** Shrink storage types, then write the cleaned daily file.
compress
save ../dta/ctrip-cleaned-daily-records.dta , replace

** Record the final variable list and summary statistics in the log.
describe, fullnames
summarize

log close
exit

